Laysetwd

setwd("C:/Users/Asus/I3S/Data")
library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
## 'SeuratObject' was built with package 'Matrix' 1.7.1 but the current
## version is 1.7.3; it is recomended that you reinstall 'SeuratObject' as
## the ABI for 'Matrix' may have changed
## 
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
## 
##     intersect, t
library(patchwork)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

load data

# load data1
data_neo <- Read10X(data.dir = "C:/Users/Asus/I3S/Data/neonatal/EL230718_2/filtered_feature_bc_matrix", strip.suffix = TRUE)
# make RNA matrix2
neo <- CreateSeuratObject(counts = data_neo,
                          project = "neo_natal",
                          min.cells = 10,
                          min.features = 200)

# load data2
data_1day <- Read10X(data.dir = "C:/Users/Asus/I3S/Data/1Day/EL230718_3/filtered_feature_bc_matrix", strip.suffix = TRUE)
# make RNA matrix2
day1 <- CreateSeuratObject(counts = data_1day,
                          project = "1_day",
                          min.cells = 10,
                          min.features = 200)

# load data3
data_year10 <- Read10X(data.dir = "C:/Users/Asus/I3S/Data/10Yrs/EL230718_4/filtered_feature_bc_matrix", strip.suffix = TRUE)
# make RNA matrix2
year10 <- CreateSeuratObject(counts = data_year10,
                          project = "10_years",
                          min.cells = 10,
                          min.features = 200)

# load data4
data_year42 <- Read10X(data.dir = "C:/Users/Asus/I3S/Data/42Yrs/EL230718_6/filtered_feature_bc_matrix", strip.suffix = TRUE)
# make RNA matrix2
year42 <- CreateSeuratObject(counts = data_year42,
                          project = "42_years",
                          min.cells = 10,
                          min.features = 200)


# load data5
data_75 <- Read10X(data.dir = "C:/Users/Asus/I3S/Data/75Yrs/EL230718_7/filtered_feature_bc_matrix", strip.suffix = TRUE)
# make RNA matrix2
year75 <- CreateSeuratObject(counts = data_75,
                          project = "75_years",
                          min.cells = 10,  #Alterar de 3 para 10?
                          min.features = 200)

# load data6
data_year87 <- Read10X(data.dir = "C:/Users/Asus/I3S/Data/87Yrs/EL230718_8/filtered_feature_bc_matrix", strip.suffix = TRUE)
# make RNA matrix
year87 <- CreateSeuratObject(counts = data_year87,
                          project = "87_years",
                          min.cells = 10,
                          min.features = 200)

NumberofCounts

# check out number of cells
neocounts <- length(rownames(neo@meta.data))


# check out number of cells
day1counts <- length(rownames(day1@meta.data))


# check out number of cells
year10counts <- length(rownames(year10@meta.data))


# check out number of cells
year42counts <- length(rownames(year42@meta.data))


# check out number of cells
year75counts <- length(rownames(year75@meta.data))


# check out number of cells6
year87counts <- length(rownames(year87@meta.data))

Visualize Data Counts

# Compile counts into a data frame
counts_df <- data.frame(
  Age = c("Neonatal", "1Day", "10Years", "42Years", "75Years", "87Years"),
  NumberofCells = c(neocounts, day1counts, year10counts, year42counts, year75counts, year87counts)
)

# Create a bar graph
library(ggplot2)
library(dplyr)

# Convert Age to a factor with specific levels
counts_df <- counts_df %>%
  mutate(Age = factor(Age, levels = c("Neonatal", "1Day", "10Years", "42Years", "75Years", "87Years")))

# Create the bar plot with text labels above each bar
ggplot(counts_df, aes(x = Age, y = NumberofCells, fill = Age)) +
  geom_bar(stat = "identity") + 
  geom_text(aes(label = NumberofCells, y = NumberofCells + max(NumberofCells) * 0.02),
            color = "black", size = 6, fontface = "bold") +  # Increased from 4 to 6
  scale_fill_manual(values = c("Neonatal" = "#E41A1C", "1Day" = "#377EB8", "10Years" = "#4DAF4A", 
                               "42Years" = "#984EA3", "75Years" = "#FF7F00", "87Years" = "#A65628")) +
  theme_minimal() +
  theme(text = element_text(size = 14),  # Base text size
        axis.title = element_text(size = 16),  # Axis titles
        plot.title = element_text(size = 18)) +  # Plot title
  labs(
    title = "Cell Counts Across Different Ages",
    x = "Age",
    y = "Number of Cells"
  ) +
  theme(legend.position = "none")

Genes Expressed Boxplots

##<10k

# Combine the data into one data frame
data_combined <- data.frame(
  Dataset = rep(c("Neonatal", "1Day", "10Years", "42Years", "75Years", "87Years"),
                times = c(neocounts, day1counts, year10counts,
                          year42counts, year75counts, year87counts)),
  GenesExpressed = c(neo@meta.data$nFeature_RNA, day1@meta.data$nFeature_RNA,
                     year10@meta.data$nFeature_RNA, year42@meta.data$nFeature_RNA,
                     year75@meta.data$nFeature_RNA, year87@meta.data$nFeature_RNA)
)


  
# Create the boxplot with points
library(ggplot2)
library(dplyr)

data_combined <- data_combined %>%
  mutate(Dataset = factor(Dataset, levels = c("Neonatal", "1Day", "10Years", "42Years", "75Years", "87Years")))

# Calculate median for each Dataset
medians <- data_combined %>%
  group_by(Dataset) %>%
  summarise(median_value = median(GenesExpressed))

# Set a fixed y position for all the median labels
fixed_y_position <- max(data_combined$GenesExpressed) + 2  # Adjust the '2' to control label positioning

ggplot(data_combined, aes(x = Dataset, y = GenesExpressed)) +
  geom_boxplot(outliers = FALSE, fill = "lightblue", alpha = 0.6) +
  geom_jitter(aes(color = Dataset), size = 1.2, alpha = 0.6, width = 0.2) +  # Increased point size
  geom_text(data = medians, aes(x = Dataset, y = fixed_y_position, 
                               label = paste("Median:", round(median_value, 1))), 
            color = "black", size = 5) +  # Increased from 3.5 to 5
  theme_minimal() +
  theme(text = element_text(size = 14),
        axis.title = element_text(size = 16),
        plot.title = element_text(size = 18)) +
  scale_color_manual(values = c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#A65628")) +
  labs(
    title = "Genes Expressed Per Cell Across Datasets",
    x = "Dataset",
    y = "Number of Genes Expressed"
  ) +
  theme(legend.position = "none")

##Quality Control

library(Seurat)
# Merge the datasets
merged_data <- merge(
  neo, 
  y = list(day1, year10, year42, year75, year87), 
  add.cell.ids = c("neo", "day1", "year10", "year42", "year75", "year87"),
  project = "CombinedData"
)

age_colors <- c(
  "Neonatal" = "#E41A1C",  # Red
  "1Day" = "#377EB8",      # Blue
  "10Years" = "#4DAF4A",   # Green
  "42Years" = "#984EA3",   # Purple
  "75Years" = "#FF7F00",   # Orange
  "87Years" = "#A65628"    # Brown
)

# Add metadata columns for mitochondrial and ribosomal gene percentages
merged_data[["percent.mt"]] <- PercentageFeatureSet(merged_data, pattern = "^MT-")
merged_data[["percent.rb"]] <- PercentageFeatureSet(merged_data, pattern = "RP[LS]")
merged_data[["log10GenesPerUMI"]] <- log10(merged_data$nFeature_RNA) / log10(merged_data$nCount_RNA)

# Add an 'Age' column to the metadata
merged_data@meta.data <- merged_data@meta.data %>%
  mutate(Age = case_when(
    grepl("^neo_", rownames(.)) ~ "Neonatal",
    grepl("^day1_", rownames(.)) ~ "1Day",
    grepl("^year10_", rownames(.)) ~ "10Years",
    grepl("^year42_", rownames(.)) ~ "42Years",
    grepl("^year75_", rownames(.)) ~ "75Years",
    grepl("^year87_", rownames(.)) ~ "87Years"
  ))

# Calculate the number of cells for each age group
cell_counts <- merged_data@meta.data %>%
  group_by(Age) %>%
  summarise(Cell_Count = n()) %>%
  mutate(Age_Label = paste0(Age, " (n = ", Cell_Count, ")"))


# Merge the cell counts back into the metadata
merged_data@meta.data <- merged_data@meta.data %>%
  left_join(cell_counts, by = "Age")

rownames(merged_data@meta.data) <- colnames(merged_data@assays$RNA)

# Factor the 'Age' column
merged_data@meta.data$Age <- factor(
  merged_data@meta.data$Age, 
  levels = c("Neonatal", "1Day", "10Years", "42Years", "75Years", "87Years")
)

# Convert Age_Label to a factor with the same order as Age
merged_data@meta.data <- merged_data@meta.data %>%
  mutate(Age_Label = factor(Age_Label, levels = unique(Age_Label[order(Age)])))


# Plotting before filtering
# Plot 1: Number of UMIs per cell
ggplot(merged_data@meta.data, aes(color = Age, x = nCount_RNA, fill = Age)) + 
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  ylab("Cell density") +
  geom_vline(xintercept = 500, linetype = "dashed", color = "red") +
  ggtitle("Distribution of UMIs per Cell") +
  scale_color_manual(values = age_colors) +   
  scale_fill_manual(values = age_colors)      

# Plot 2: Distribution of genes detected per cell
ggplot(merged_data@meta.data, aes(color = Age, x = nFeature_RNA, fill = Age)) + 
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  geom_vline(xintercept = 300, linetype = "dashed", color = "red") +
  ggtitle("Distribution of Genes Detected per Cell") +
  scale_color_manual(values = age_colors) +   
  scale_fill_manual(values = age_colors) 

# Plot 3: Genes per UMI (Novelty Score)
ggplot(merged_data@meta.data, aes(x = log10GenesPerUMI, color = Age, fill = Age)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  geom_vline(xintercept = 0.8, linetype = "dashed", color = "red") +
  ggtitle("Genes per UMI (Novelty Score)") +
  scale_color_manual(values = age_colors) +   
  scale_fill_manual(values = age_colors) 

# Plot 4: Distribution of mitochondrial gene expression per cell
ggplot(merged_data@meta.data, aes(color = Age, x = percent.mt, fill = Age)) + 
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  geom_vline(xintercept = 0.2, linetype = "dashed", color = "red") +
  ggtitle("Distribution of Mitochondrial Gene Expression per Cell") +
  scale_color_manual(values = age_colors) +   
  scale_fill_manual(values = age_colors) 

# Plot 5: Distribution of ribosomal gene expression per cell
ggplot(merged_data@meta.data, aes(color = Age, x = percent.rb, fill = Age)) + 
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  geom_vline(xintercept = 0.2, linetype = "dashed", color = "red") +
  ggtitle("Distribution of Ribosomal Gene Expression per Cell") +
  scale_color_manual(values = age_colors) +   
  scale_fill_manual(values = age_colors) 

# Plot 6: Correlation between genes detected and number of UMIs
ggplot(merged_data@meta.data, aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) + 
  geom_point(alpha = 0.6) + 
  scale_colour_gradient(low = "gray90", high = "black") +
  stat_smooth(method = lm) +
  scale_x_log10() + 
  scale_y_log10() + 
  theme_classic() +
  theme(text = element_text(size = 14),
        axis.title = element_text(size = 16),
        plot.title = element_text(size = 18),
        strip.text = element_text(size = 12)) +
  geom_vline(xintercept = 500, color = "red", linetype = "dashed") +  
  geom_hline(yintercept = 250, color = "red", linetype = "dashed") +  
  facet_wrap(~Age_Label) +  # Use the custom label here
  labs(
    title = "Correlation between Genes Detected and Number of UMIs",
    x = "Number of UMIs (log scale)",
    y = "Number of Genes Detected (log scale)",
    color = "Mitochondrial %"
  ) #(Adicionar Linha de media e mediana a cada idade)
## `geom_smooth()` using formula = 'y ~ x'

# Filter the Seurat object BEFORE violin plots

filtered_data <- subset(
  merged_data, 
  subset = (nCount_RNA >= 500) & 
           (nFeature_RNA >= 250) & 
           (log10GenesPerUMI > 0.80) &  
           (percent.mt < 20)
)



# Check quantiles for mitochondrial RNA to determine cutoff
quantile(merged_data@meta.data$percent.mt, probs = c(0.75,  0.8, 0.85, 0.9, 0.95))
##       75%       80%       85%       90%       95% 
##  4.601257  4.834740  5.142649  5.661620 10.420633
quantile(merged_data@meta.data$percent.rb, probs = c(0.75,  0.8, 0.85, 0.9, 0.95))
##      75%      80%      85%      90%      95% 
## 20.78881 21.29882 21.92112 22.69106 24.05267
# For the violin plots, modify to:
VlnPlot(filtered_data, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), 
        pt.size = 0.1) +  # Smaller points
  theme(text = element_text(size = 14),
        axis.title = element_text(size = 16),
        plot.title = element_text(size = 18)) +
  xlab("Age")  # Change x-axis label from "identity" to "Age"

VlnPlot(filtered_data, features = c("nCount_RNA", "nFeature_RNA", "percent.rb"), 
        pt.size = 0.1) +
  theme(text = element_text(size = 14),
        axis.title = element_text(size = 16),
        plot.title = element_text(size = 18)) +
  xlab("Age")

# Reassign to filtered Seurat object
#filtered_data <- CreateSeuratObject(filtered_data, meta.data = filtered_data@meta.data)

#summary(merged_data@meta.data$percent.rb)
#summary(merged_data@meta.data$percent.mt)  ##Update

# Subset the Seurat object based on quality control criteria
##merged_data <- subset(merged_data, subset = nFeature_RNA > 200 & nFeature_RNA < 15000 & percent.mt < 5.5 & percent.rb < 5.5)
# Calculate the number of cells for each age group
cell_counts <- filtered_data@meta.data %>%
  group_by(Age) %>%
  summarise(Cell_Count = n()) %>%
  mutate(Age_Label = paste0(Age, " (n = ", Cell_Count, ")"))


# Merge the cell counts back into the metadata
filtered_data@meta.data <- filtered_data@meta.data %>%
  left_join(cell_counts, by = "Age")

rownames(filtered_data@meta.data) <- colnames(filtered_data@assays$RNA)

# Factor the 'Age' column
filtered_data@meta.data$Age <- factor(
  filtered_data@meta.data$Age, 
  levels = c("Neonatal", "1Day", "10Years", "42Years", "75Years", "87Years")
)

# Convert Age_Label to a factor with the same order as Age
filtered_data@meta.data <- filtered_data@meta.data %>%
  mutate(Age_Label.y = factor(Age_Label.y, levels = unique(Age_Label.y[order(Age)])))


# Plot 1: Number of UMIs per cell
ggplot(filtered_data@meta.data, aes(color = Age, x = nCount_RNA, fill = Age)) + 
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  ylab("Cell density") +
  geom_vline(xintercept = 500, linetype = "dashed", color = "red") +
  ggtitle("Distribution of UMIs per Cell") +
  scale_color_manual(values = age_colors) +   
  scale_fill_manual(values = age_colors) 

# Plot 2: Distribution of genes detected per cell
ggplot(filtered_data@meta.data, aes(color = Age, x = nFeature_RNA, fill = Age)) + 
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  geom_vline(xintercept = 300, linetype = "dashed", color = "red") +
  ggtitle("Distribution of Genes Detected per Cell") +
  scale_color_manual(values = age_colors) +   
  scale_fill_manual(values = age_colors) 

# Plot 3: Genes per UMI (Novelty Score)
ggplot(filtered_data@meta.data, aes(x = log10(nFeature_RNA) / log10(nCount_RNA), color = Age, fill = Age)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  geom_vline(xintercept = 0.8, linetype = "dashed", color = "red") +
  ggtitle("Genes per UMI (Novelty Score)") +
  scale_color_manual(values = age_colors) +   
  scale_fill_manual(values = age_colors) 

# **New Plot**: Distribution of mitochondrial gene expression per cell
ggplot(filtered_data@meta.data, aes(color = Age, x = percent.mt, fill = Age)) + 
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  geom_vline(xintercept = 0.2, linetype = "dashed", color = "red") +
  ggtitle("Distribution of Mitochondrial Gene Expression per Cell") +
  scale_color_manual(values = age_colors) +   
  scale_fill_manual(values = age_colors) 

# **New Plot**: Distribution of ribosomal gene expression per cell
ggplot(filtered_data@meta.data, aes(color = Age, x = percent.rb, fill = Age)) + 
  geom_density(alpha = 0.2) + 
  scale_x_log10() + 
  theme_classic() +
  geom_vline(xintercept = 0.2, linetype = "dashed", color = "red") +
  ggtitle("Distribution of Ribosomal Gene Expression per Cell") +
  scale_color_manual(values = age_colors) +   
  scale_fill_manual(values = age_colors) 

# Visualize the correlation between genes detected and number of UMIs
filtered_data@meta.data %>%
  ggplot(aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) + 
  geom_point(alpha = 0.6) + 
  scale_colour_gradient(low = "gray90", high = "black")+
  stat_smooth(method = lm) + #Adicionar R2 e correlation
  scale_x_log10() + 
  scale_y_log10() + 
  theme_classic() +
  theme(text = element_text(size = 14),
        axis.title = element_text(size = 16),
        plot.title = element_text(size = 18),
        strip.text = element_text(size = 12)) +
  geom_vline(xintercept = 500, color = "red", linetype = "dashed") +  
  geom_hline(yintercept = 250, color = "red", linetype = "dashed") +  
  facet_wrap(~Age_Label.y) + 
  labs(
    title = "Correlation between Genes Detected and Number of UMIs",
    x = "Number of UMIs (log scale)",
    y = "Number of Genes Detected (log scale)",
    color = "Mitochondrial %"
  )
## `geom_smooth()` using formula = 'y ~ x'

Comparison of normalization methods

library(Seurat)

#Log Normalization

##year87_datalog <- NormalizeData(year87_data, normalization.method = "LogNormalize", scale.factor = 10000)
##year87_datalog <- FindVariableFeatures(year87_datalog, selection.method = "vst", nfeatures = 2000)
##all.genes <- rownames(year87_datalog)
##year87_datalog <- ScaleData(year87_datalog, features = all.genes)

##top10var <- head(VariableFeatures(year87_datalog), 10)

# plot variable features with and without labels
##plot1 <- VariableFeaturePlot(year87_datalog)
##top10plot <- LabelPoints(plot = plot1, points = top10var, repel = T)
##top10plot


#SCTransform

##year87_datasct <- SCTransform(year87_data, verbose = FALSE)
##year87_datasct <- FindVariableFeatures(year87_datasct, selection.method = "vst", nfeatures = 2000)
##all.genes <- rownames(year87_datasct)
##year87_datasct <- ScaleData(year87_datasct, features = all.genes)

##top10var <- head(VariableFeatures(year87_datasct), 10)

# plot variable features with and without labels
##plot2 <- VariableFeaturePlot(year87_datasct)
##top10plot <- LabelPoints(plot = plot2, points = top10var, repel = T)
##top10plot

Save Session

##Fout <- "C:/Users/Asus/I3S/Data/SCEAnalysis/Saves/Mysession.RData"
##save.image(Fout,safe = TRUE)